Data integration with STACAS

Introduction

STACAS is a method for scRNA-seq integration. It is based on the Seurat integration framework, but adds important innovations:

  • anchors can be filtered/down-weighted based on their distance in reciprocal PCA space, calculated from the unscaled, normalized data
  • integration trees are constructed based on the ‘centrality’ of datasets, as measured by the sum of their anchor scores
  • cell type labels, if known, can be given as input to the algorithm to perform semi-supervised integration

In this demo we will show the application of STACAS to integrate a collection of PBMC datasets, used also in the excellent, comprehensive benchmark by Luecken et al.. The data are available at: figshare/12420968

R environment

Get and load some useful packages

renv::restore()

BiocManager::install('remotes')
BiocManager::install('zellkonverter')

library(remotes)
install_github("https://github.com/carmonalab/scIntegrationMetrics")
install_github("https://github.com/carmonalab/STACAS")
library(Seurat)
library(dplyr)
library(ggplot2)
library(scIntegrationMetrics)
library(STACAS)
library(zellkonverter)

options(future.globals.maxSize= 8000*1024^2, future.seed=TRUE)
seed = 1234
set.seed(seed)

Load test datasets

Download the collection of PBMC datasets assembled by Luecken et al., and convert them to Seurat objects.

download <- T
where <- 'aux'
dir.create(where, showWarnings = FALSE)

rds.path <- sprintf("%s/Immune_ALL_human.rds", where)

if(download){
  options(timeout=500)
  url <- "https://figshare.com/ndownloader/files/25717328"
  h5.path <- sprintf("%s/Immune_ALL_human.rds", where)
  download.file(url = url, destfile = h5.path)
  
  #Convert to Seurat 
  luecken.sce <- zellkonverter::readH5AD(h5.path)
  luecken.seurat <- Seurat::as.Seurat(luecken.sce, counts = "counts", data = "X")
  luecken.seurat <- RenameAssays(luecken.seurat, originalexp="RNA")
  rm (luecken.sce)
  
  Idents(luecken.seurat) <- "final_annotation"
  saveRDS(object = luecken.seurat,file = rds.path)

}else{
  luecken.seurat <- readRDS(rds.path)
}
## Renaming default assay from originalexp to RNA
## Warning: Cannot add objects with duplicate keys (offending key: originalexp_)
## setting key to original value 'rna_'

Cell types were annotated by the authors on each dataset individually, using a common dictionary of cell types (see https://github.com/theislab/scib-reproducibility)

table(luecken.seurat$final_annotation, luecken.seurat$batch)[,1:5]
##                                   
##                                     10X Freytag Oetjen_A Oetjen_P Oetjen_U
##   CD4+ T cells                     2937    1238      577      295     1652
##   CD8+ T cells                      350     270       93      639      253
##   CD10+ B cells                       0       0       91       41       75
##   CD14+ Monocytes                  3388     452      350      263      384
##   CD16+ Monocytes                   364      25       61       26       78
##   CD20+ B cells                    1546     427       43       31      417
##   Erythrocytes                        0       0      186     1219       97
##   Erythroid progenitors               0       0      112      249      102
##   HSPCs                              28       0      227      119       99
##   Megakaryocyte progenitors          21      16       72       71       76
##   Monocyte progenitors                0       0      183      109      136
##   Monocyte-derived dendritic cells  182       0       89       60       65
##   NK cells                          756     476       26        0       63
##   NKT cells                        1056     432      389       62      157
##   Plasma cells                       18       0       33       40       38
##   Plasmacytoid dendritic cells       81      11       54       41       38

How does the collection of dataset look without any integration?

nfeatures <- 2000
ndim <- 50

luecken.seurat <- FindVariableFeatures(luecken.seurat, nfeatures = nfeatures)
luecken.seurat <- luecken.seurat %>% NormalizeData() %>% ScaleData() 
luecken.seurat <- luecken.seurat %>% RunPCA(npcs=ndim) %>% RunUMAP(dims=1:ndim)
a <- DimPlot(luecken.seurat, group.by = "batch") + theme(aspect.ratio = 1) + ggtitle("Before integration")
b <- DimPlot(luecken.seurat, group.by = "final_annotation", label=T, label.size = 4) + theme(aspect.ratio = 1)

a | b

Cells mostly cluster by the cell types annotated in individual datasets, but there are batch effects (=dataset-specific signals). We can try to quantify how much by LISI (Local Inverse Simpson’s Index)[https://www.nature.com/articles/s41592-019-0619-0] and silhouette coefficients (aka Average Silhouette Width).

red <- "pca"
red.ndim <- 10
embeds <- luecken.seurat@reductions[[red]]@cell.embeddings[,1:red.ndim]
meta <- luecken.seurat@meta.data

features <- c("final_annotation","batch")

res.lisi <- compute_lisi(embeds, meta_data = meta, label_colnames=features)

res.lisi.avg_init <- apply(res.lisi,2,mean)
res.lisi.avg_init
## final_annotation            batch 
##         1.360443         2.601625
res.sil <- compute_silhouette(embeds, meta_data = meta, label_colnames=features)

res.sil.avg_init <- apply(res.sil,2,mean)
res.sil.avg_init
## final_annotation            batch 
##       0.10399608      -0.08924522

STACAS integration - unsupervised

Split data by sample

obj.list <- SplitObject(luecken.seurat, split.by = "batch")

Calculate variable features in each batch/sample

For simplicity, here we use default parameters for finding variable genes (HVG). However, we recommend excluding from HVGs all genes that may be sensitive to technical artifacts (rather than being markers for cell types), such as mitochondrial genes, heat-shock proteins, cell cycling genes, etc. See Note 2 at the end of this document for more details.

nFeatures <- 2000
nIntFeatures <- 2000
ndim <- 50

obj.list <- lapply(obj.list, function(x) {
      FindVariableFeatures.STACAS(x, nfeat = nFeatures)
})

anchor.features <- SelectIntegrationFeatures(obj.list, nfeatures = nIntFeatures)

Find integration anchors with STACAS

stacas_anchors <- FindAnchors.STACAS(obj.list,
                                     anchor.features = anchor.features,
                                     dims = 1:ndim)
##  1/10 2/10 3/10 4/10 5/10 6/10 7/10 8/10 9/10 10/10
## Finding integration anchors...

Integration order

st1 <- SampleTree.STACAS(
  anchorset = stacas_anchors,
  obj.names = names(obj.list)
  )

Finally, perform dataset integration

integrated.data.unsup <- IntegrateData.STACAS(stacas_anchors, dims=1:ndim, sample.tree=st1,
                                 features.to.integrate=stacas_anchors@anchor.features)

Calculate low-dimensional embeddings and visualize integration results in UMAP

DefaultAssay(integrated.data.unsup) <- "integrated"
integrated.data.unsup <- integrated.data.unsup %>% ScaleData() %>% RunPCA(npcs=ndim)
integrated.data.unsup <- RunUMAP(integrated.data.unsup, dims=1:ndim)
p1 <- DimPlot(integrated.data.unsup, group.by = "batch") + theme(aspect.ratio = 1) + ggtitle("Unsupervised")
p2 <- DimPlot(integrated.data.unsup, group.by = "final_annotation", label=T, label.size = 2) + theme(aspect.ratio = 1)

p1 | p2

LISI and silhouette integration quality metrics on integrated data

red <- "pca"
red.ndim <- 10
embeds <- integrated.data.unsup@reductions[[red]]@cell.embeddings[,1:red.ndim]
meta <- integrated.data.unsup@meta.data
features <- c("final_annotation","batch")

res.lisi <- compute_lisi(embeds, meta_data = meta, label_colnames=features, perplexity = 30)

res.lisi.avg_unsup <- apply(res.lisi,2,mean)
res.lisi.avg_unsup
## final_annotation            batch 
##         1.263123         3.523320
res.sil <- compute_silhouette(embeds, meta_data = meta, label_colnames=features)

res.sil.avg_unsup <- apply(res.sil,2,mean)
res.sil.avg_unsup
## final_annotation            batch 
##        0.2283455       -0.1463899

STACAS integration - semi-supervised

When available, cell type annotations can be used to guide the alignment. STACAS will use this information to penalize anchors where cell types are inconsistent (you can decide how much weight to give to cell labels with the label.confidence parameter). Note that not all cells must be annotated: we recommend to set labels to NA or unknown for cells that cannot be confidently annotated, and they won’t be penalized for label inconsistency.

In this collection of datasets, cell types were annotated by the authors of the benchmark. For your own data, you may want to do manual annotation or apply one of several cell annotation tools, such as SingleR, Garnett or scGate. We will be posting some examples of cell type annotation in a different demo.

stacas_anchors <- FindAnchors.STACAS(obj.list,
                                     anchor.features = anchor.features,  
                                     dims = 1:ndim,
                                     label.confidence = 1,
                                     cell.labels = "final_annotation")
##  1/10 2/10 3/10 4/10 5/10 6/10 7/10 8/10 9/10 10/10
## Finding integration anchors...

Recompute integration tree with the new set of anchors.

st2 <- SampleTree.STACAS(
  anchorset = stacas_anchors,
  obj.names = names(obj.list)
  )
## Building integration tree with base dataset: Oetjen_U

And finally integrate the datasets using the anchor set and integration tree.

integrated.data.ss <- IntegrateData.STACAS(stacas_anchors, dims=1:ndim, sample.tree=st2,
                                 features.to.integrate=stacas_anchors@anchor.features,
                                 semisupervised=TRUE)

Calculate low-dimensional embeddings and see integration results in UMAP space

DefaultAssay(integrated.data.ss) <- "integrated"
integrated.data.ss <- integrated.data.ss %>% ScaleData() %>% RunPCA(npcs=ndim)
integrated.data.ss <- RunUMAP(integrated.data.ss, dims=1:ndim)
p1 <- DimPlot(integrated.data.ss, group.by = "batch") + theme(aspect.ratio = 1) + ggtitle("Semisupervised")
p2 <- DimPlot(integrated.data.ss, group.by = "final_annotation", label=T, label.size = 2) + theme(aspect.ratio = 1)

p1 | p2

LISI and silhouette

red <- "pca"
red.ndim <- 10
embeds <- integrated.data.ss@reductions[[red]]@cell.embeddings[,1:red.ndim]
meta <- integrated.data.ss@meta.data
features <- c("final_annotation","batch")

res.lisi <- compute_lisi(embeds, meta_data = meta, label_colnames=features)

print("lisi")
## [1] "lisi"
res.lisi.avg_ss <- apply(res.lisi,2,mean)
res.lisi.avg_ss
## final_annotation            batch 
##         1.220149         3.497174
res.sil <- compute_silhouette(embeds, meta_data = meta, label_colnames=features)

print("sil")
## [1] "sil"
res.sil.avg_ss <- apply(res.sil,2,mean)
res.sil.avg_ss
## final_annotation            batch 
##        0.2554488       -0.1407194

Integration metrics summary

We have calculated LISI and silhouette coefficients based on the cell type annotations (“final_annotation”) and on the sample of origin (“batch”). These correspond respectively to cluster metrics (cLISI and cSil) and to batch mixing metrics (iLISI and iSil), which can be interpreted as follows:

  • iLISI measures the local mixing of datasets. It should be high for well mixed datasets (up to the number of datasets). However, when cell types are only present in one or few studies, high iLISI for these cell types would in fact be undesiderable, and indicative of overcorrection.

  • cLISI measures the local mixing of cell labels (or clusters, or cell types), and it should be as close as possible to 1. It tells how many different labels are found around each cell on average. This measure is sensitive to the size of the groups, and does not account for repeated structures in the data (i.e. it is a local measure of mixing)

  • cSil is the silhouette coefficient for cell labels (or clusters, or cell types). It measures the average distance between cells with the same label (= cell type), compared to cells with different label. It is a global measure and should be good to evaluate the quality of integration in terms of cell annotations.

  • iSil is the silhouette coefficient per dataset. Not straightforward to interpret.

summary <- data.frame(matrix(ncol = 4, nrow = 3))
colnames(summary) <- c("cLISI","iLISI","cSil","iSil")
rownames(summary) <- c("Unintegrated","Unsupervised","Semisup")

summary["Unintegrated","cLISI"] <- res.lisi.avg_init[1]
summary["Unintegrated","iLISI"] <- res.lisi.avg_init[2]
summary["Unintegrated","cSil"] <- res.sil.avg_init[1]
summary["Unintegrated","iSil"] <- res.sil.avg_init[2]

summary["Unsupervised","cLISI"] <- res.lisi.avg_unsup[1]
summary["Unsupervised","iLISI"] <- res.lisi.avg_unsup[2]
summary["Unsupervised","cSil"] <- res.sil.avg_unsup[1]
summary["Unsupervised","iSil"] <- res.sil.avg_unsup[2]

summary["Semisup","cLISI"] <- res.lisi.avg_ss[1]
summary["Semisup","iLISI"] <- res.lisi.avg_ss[2]
summary["Semisup","cSil"] <- res.sil.avg_ss[1]
summary["Semisup","iSil"] <- res.sil.avg_ss[2]

summary
##                 cLISI    iLISI      cSil        iSil
## Unintegrated 1.360443 2.601625 0.1039961 -0.08924522
## Unsupervised 1.263123 3.523320 0.2283455 -0.14638987
## Semisup      1.220149 3.497174 0.2554488 -0.14071935

The most informative metrics are probably the iLISI (degree of dataset mixing) and cSil (distance to own cluster).

In summary, we see that integration improves the mixing of datasets (increased iLISI compared to unintegrated data), while also improving the internal distances between cells assigned to the same label (increased cSil). Semi-supervised alignment can further help in bringing cell of the same kind together in the final integrated space.

m <- reshape2::melt(t(summary[,c("iLISI","cSil")]))
colnames(m) <- c("Metric","Method","value")
pal <- c("#999999", "#E69F00", "#56B4E9")

a <- ggplot(m[m$Metric=="iLISI",], aes(x=Method, y=value, fill=Method)) +
  ggtitle("iLISI") + scale_fill_manual(values=pal) +
  geom_bar(stat="identity") + theme_bw() + theme(legend.position = "none")

b <- ggplot(m[m$Metric=="cSil",], aes(x=Method, y=value, fill=Method)) +
  ggtitle("cSil") + scale_fill_manual(values=pal) +
  geom_bar(stat="identity") + theme_bw()

a | b

Important notes

  1. The first critical step in batch effect correction is to ensure that the gene names in your datasets are harmonized. If different studies use different naming conventions, synonyms, etc., methods for dimensionality reduction will treat these genes as entirely different variables, and introducing artificial differences between datasets. It is recommended that your pre-processing pipeline takes care of resolving ambiguities in gene naming across datasets.

  2. The calculation of highly variable genes (HGV) is a fundamental step for dimensionality reduction (and for integration based on low dimensional representations). We recommend excluding certain classes of genes, e.g. mitochondrial, ribosomal, heat-shock genes, from the list of variable genes, as their expression may be more related to technical variabilites rather than to cell type identity. The FindVariableGenes.STACAS() function allows providing a list of genes to exclude from HVG; see an example below.

We can use the collection of signatures stored in SignatuR package

remotes::install_github("https://github.com/carmonalab/SignatuR")
library(SignatuR)
print(SignatuR$Hs)
##                  levelName
## 1  Hs                     
## 2   ¦--Blocklists         
## 3   ¦   ¦--Pseudogenes    
## 4   ¦   ¦--HSP            
## 5   ¦   °--Non-coding     
## 6   ¦--Cell_types         
## 7   ¦--Programs           
## 8   ¦   ¦--cellCycle.G1S  
## 9   ¦   ¦--cellCycle.G2M  
## 10  ¦   °--IFN            
## 11  °--Compartments       
## 12      ¦--Mito           
## 13      ¦--Ribo           
## 14      ¦--TCR            
## 15      °--Immunoglobulins

Then we can calculate HVG for STACAS excluding specific gene sets

my.genes.blocklist <- c(GetSignature(SignatuR$Hs$Blocklists),
                        GetSignature(SignatuR$Hs$Programs),
                        GetSignature(SignatuR$Hs$Compartments))

obj.list <- lapply(obj.list, function(x) {
    FindVariableFeatures.STACAS(x, nfeat = 2000, genesBlockList = my.genes.blocklist)

})

anchor.features <- SelectIntegrationFeatures(obj.list, nfeatures = nIntFeatures)

Further reading

The STACAS package and installation instructions are available at: STACAS package

The code for this demo can be found on GitHub

References

  • Andreatta A., Carmona S. J. (2021). STACAS: Sub-Type Anchor Correction for Alignment in Seurat to integrate single-cell RNA-seq data. - Bioinformatics

  • Luecken, M. D., Büttner, M., Chaichoompu, K., Danese, A., Interlandi, M., Müller, M. F., … & Theis, F. J. (2022). Benchmarking atlas-level data integration in single-cell genomics. - Nature methods

  • Hao, Y., Hao, S., Andersen-Nissen, E., Mauck III, W. M., Zheng, S., Butler, A., … & Satija, R. (2021). Integrated analysis of multimodal single-cell data. - Cell